Visualize all Observations by State & Species

data(us.cities)

# Get major cities for each sample region (state)
.states <- c("OR", "VT", "CO", "NC")
top.cities <- purrr::map_df(.states, function(s) {
  out <- us.cities %>% 
  filter(country.etc==s) %>%
  mutate(city = gsub(paste0(" ", s), "", name)) %>%
  arrange(-pop)
  if (s == "OR") {
    out <- out %>% 
      head() %>%
      filter(!(city %in% c("Gresham", "Hillsboro", "Corvallis",
                           "Beaverton", "Springfield")))
  } else if (s == "CO") {
    out <- out %>%
      head() %>%
      filter(!(city %in% c("Thornton", "Lakewood", "Aurora")))
  } else if (s == "NC") {
    out <- out %>%
      head() %>%
      filter(!(city %in% c("Greensboro", "Durham", "Fayetteville")))
  } else {
    out <- out %>% head()
  }
  out
})

# Load the map data
states <- map_data("state") %>% 
  filter(region %in% c("oregon", "north carolina", "colorado", "vermont"))

# Load your data
data.files <- list.files("data/final", full.names = T)

df <- purrr::map_df(data.files, readRDS) 

caps.after.ws <- function(string) {
  gsub("(?<=\\s)([a-z])", "\\U\\1", string, perl = T)
}

# Define a function to create a plot for each species
plot.for.species <- function(spec, st.abbr) {
  st <- case_when(st.abbr == "CO" ~ "colorado",
                  st.abbr == "NC" ~ "north carolina",
                  st.abbr == "VT" ~ "vermont",
                  st.abbr == "OR" ~ "oregon",
                  T ~ "")
  
  title <- caps.after.ws(paste(st.abbr, gsub("_", " ", spec), 
                             "Observations, 2016-2019"))
  
  p <- ggplot(data = states %>% filter(region == st)) +
    geom_polygon(aes(x = long, y = lat, group = group),
                 fill = "#989875", color = "black") +
    geom_point(data = df %>% filter(state == st.abbr & common.name == spec), 
               aes(x = lon, y = lat), 
               size=1, alpha=.5, fill = "red", shape=21) +
    geom_point(data = top.cities %>% filter(country.etc == st.abbr), 
               aes(x=long, y=lat),
               fill="gold", color="black", size=3.5, shape = 21) + 
    geom_text(data = top.cities %>% filter(country.etc == st.abbr), 
              aes(x=long, y=lat, label=city),
              color="white", hjust=case_when(st.abbr=="NC"~.2,
                                               st.abbr=="VT"~.65,
                                               T~.5),
              vjust=ifelse(st.abbr=="NC", -.65, 1.5),
              size=4) + 
    coord_map() +
    ggtitle(title) +
    theme_minimal() +
    theme(panel.background = element_blank(),
          axis.text = element_blank(),
          axis.title = element_blank(),
          axis.ticks = element_blank(),
          panel.grid = element_blank())

  data.table(
    state=st.abbr,
    species=spec,
    plot=list(p)
  )
}

spec.state <- expand.grid(unique(df$common.name), unique(df$state)) %>%
  rename(spec=Var1, st.abbr=Var2) 

# Create a list of plots
plots <- purrr::map2_df(spec.state$spec, 
                        spec.state$st.abbr, 
                        ~plot.for.species(.x, .y))
# Plot Ruddy Duck plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Ruddy Duck"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Belted Kingfisher plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Belted Kingfisher"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Wild Turkey plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Wild Turkey"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Sharp-Shinned Hawk plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Sharp-shinned Hawk"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Downy Woodpecker Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Downy Woodpecker"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Cedar Waxwing Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Cedar Waxwing"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Sandhill Crane Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Sandhill Crane"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Sanderling Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Sanderling"]$plot, 
          list(nrow=2, ncol=2)))

Explore Explanatory Rasters

states <- c("CO", "NC", "OR", "VT")
r.files <- paste0("data/final_rasters/", states, ".tif")
r.list <- purrr::map(r.files, rast)
names(r.list) <- states

TODO: - terra::freq - terra::density - terra::layerCor

Principal Component Analysis

r.df <- map_df(states, function(s) {
  df <- r.list[[s]] %>% as.data.frame()
  names(df) <- names(df) %>% gsub(paste0("_", s), "", .)
  df %>% setDT()
  df[, state := factor(s, levels=states)]
  df[apply(df, 1, function(.x) !any(is.na(.x)))]
}) 

# Custom function to process factor levels
clean.levels <- function(x) {
  # Remove non-alphanumeric characters and replace with underscores
  x <- gsub("[^a-zA-Z0-9]", "_", x)
  # Convert to lowercase
  x <- tolower(x)
  # Remove any leading or trailing underscores
  x <- gsub("^_|_$", "", x)
  x <- gsub("__", "_", x)
  x <- gsub("NLCD_Land_", "", x)
  return(x)
}

r.df[, NLCD_Land := factor(clean.levels(levels(NLCD_Land))[NLCD_Land])]

# Convert factor columns to dummy variables
df.dummies <- data.table(model.matrix(~ . - 1, 
                                      data = r.df[, .(NLCD_Land, state)])) %>%
  cbind(r.df[, -which(names(r.df) %in% c("NLCD_Land", "state")), with=F]) 

names(df.dummies) <- gsub("NLCD_Land", "", names(df.dummies))

# Ensure that there is more than one value per column (remove otherwise)
uniq.1 <- t( df.dummies[, lapply(.SD, uniqueN)]) %>%
  as.data.frame() %>%
  filter(V1 == 1) %>%
  row.names()

if (length(uniq.1) >= 1) {
  df.dummies <- df.dummies[, -which(names(df.dummies) %in% uniq.1), with=F]
}


pca.fit <- PCA(df.dummies, graph=F)
plot.PCA(pca.fit, choix="var")

res <- pca.fit$var$coord %>%
  as.data.frame() %>%
  mutate(var=as.factor(rownames(.))) %>%
  select(var, everything()) %>%
  as_tibble()
rownames(res) <- NULL
  
p.d1 <- ggplot(res, aes(x = reorder(var, Dim.1), y = Dim.1)) +
  geom_bar(stat = "identity", fill="darkblue") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.1", y = "Importance", x = "Variable") +
  theme_minimal()

p.d2 <- ggplot(res, aes(x = reorder(var, Dim.2), y = Dim.2)) +
  geom_bar(stat = "identity", fill="darkred") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.2", y = "Importance", x = "Variable") +
  theme_minimal()

ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)

Factor Analysis for Mixed Data

famd.fit <- FAMD(r.df, graph=F)

ggpubr::ggarrange(plotlist=purrr::map(
  c("var", "quanti", "quali"), 
  ~plot.FAMD(famd.fit, choix=.x)),
  ncol=1, nrow=3)

res <- famd.fit$var$coord %>%
  as.data.frame() %>%
  mutate(var=as.factor(rownames(.))) %>%
  select(var, everything()) %>%
  as_tibble()
rownames(res) <- NULL
  
p.d1 <- ggplot(res, aes(x = reorder(var, Dim.1), y = Dim.1)) +
  geom_bar(stat = "identity", fill="darkblue") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.1", y = "Importance", x = "Variable") +
  theme_minimal()

p.d2 <- ggplot(res, aes(x = reorder(var, Dim.2), y = Dim.2)) +
  geom_bar(stat = "identity", fill="darkred") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.2", y = "Importance", x = "Variable") +
  theme_minimal()

ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)

Pseudo-Absence Generation

# Set up output directory
output.dir <- "artifacts/masks_5k"
if (!dir.exists("artifacts")) dir.create("artifacts")
if (!dir.exists(output.dir)) dir.create(output.dir)

# LOAD DATA 

# Get raster data
states <- c("CO", "NC", "OR", "VT")
r.list <- purrr::map(paste0("data/final_rasters/", states, ".tif"), rast)
names(r.list) <- states

# Get observation data
obs.df <- list.files("data/final", full.names = T) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observation.point=geometry)

# BUFFERING RASTER DATA 

dist <- 5e3

mask.update <- function(i, mask.raster, obs.df, obs.field="observation.point",
                        dist=10000, u="m") {
  obs.pt <- st_transform(obs.df[i, "observation.point"], st_crs(mask.raster))
  # Create a buffer around the point, ensuring correct units
  buf <- st_buffer(obs.pt, dist=units::as_units(paste(dist, u)))
  return(terra::rasterize(buf, mask.raster, update=T, field=1))
}

# For each observation point, you can now create a distance 
# raster and then mask cells within the buffer distance
get.buffered.zones <- function(r, obs.df, obs.field="observation.point",
                               dist=10000, u="m") {
  # Create an empty raster with the same extent and resolution as r
  mask.raster <- terra::rast(r)
  # Recursively update mask.raster with additional buffered regions
  for(i in 1:nrow(obs.df)) {
    mask.raster <- mask.update(i, mask.raster, obs.df, 
                               obs.field=obs.field, 
                               dist=dist, u=u)
    gc()
  }
  return(mask.raster)
}

# Get masks by state, species
masks <- purrr::map(states, function(state) {
    specs <- sort(unique(obs.df$common.name))
    spec.masks <- purrr::map(specs, function(spec, st=state) {
      fname <- file.path(output.dir, paste0(st, "_", spec, ".tif"))
      if (file.exists(fname)) {
        cat("Reading", spec, st, "mask from", fname, "\n")
        r.mask <- rast(fname)
      } else {
        cat("Computing", spec, st, "mask, and saving to", fname, "\n")
        r.mask <- get.buffered.zones(r=r.list[[st]], 
                                     obs.df=filter(obs.df, state == st & 
                                                     common.name == spec),
                                     dist=dist)
        terra::writeRaster(r.mask, fname, overwrite=T)
      }
      gc()
      r.mask
    }, .progress=T)
    names(spec.masks) <- specs
    spec.masks
  })
names(masks) <- states
# Set seed
set.seed(19)

# Function to sample n points from the non-masked parts
sample.inverse.mask <- function(r.original, r.mask, n, 
                                species, state,
                                sample.crs=4326, min.n=500,
                                output.dir="artifacts/pseudo_absence_regions") {
  if (!dir.exists(output.dir)) dir.create(output.dir)
  output.path <- file.path(output.dir,
                           gsub(" |\\-", "_", 
                                paste0(
                                  paste(state, tolower(species), sep="_"), 
                                  ".tif")
                           ))
  if (!file.exists(output.path)) {
    # Get inverse mask;
    # Set NA cells to 0, keep 0 cells as 0, change other cells to 1
    r.inverse <- terra::ifel(is.na(r.mask), 0, r.mask)
    # Set 0 to 1 and everything else to NA
    r.inverse <- terra::lapp(r.inverse, fun = function(x) ifelse(x == 0, 1, NA))
    # Crop so that anything outside of the state is NA
    r.cropped <- terra::crop(r.inverse, terra::ext(r.original))
    
    # Create a binary raster from r.original where valid values are 
    # set to 1 and NA values remain NA
    r.binary <- terra::lapp(r.original[[1]], 
                            fun = function(x) ifelse(!is.na(x), 1, NA))
    
    # Multiply the cropped raster by the binary raster to ensure 
    # outside values are set to NA
    r.final <- r.cropped * r.binary
    terra::writeRaster(r.final, output.path, overwrite=T)
  } else {
    r.final <- terra::rast(output.path)
  }
  
  # Convert the raster to SpatialPoints
  gdf <- terra::as.points(r.final) %>%
    st_as_sf() %>%
    st_transform(crs=sample.crs)
  if (nrow(gdf) > 0) {
    gdf <- gdf %>%
      filter(!is.na(layer)) %>%
      select(geometry)
  } else {
    return(gdf)
  }
  
  # Set to min.n size if n < min.n
  if (n < min.n) n <- min.n
  # Make sure there is sufficient available sample points
  if (n > nrow(gdf)) n <- nrow(gdf)
  
  # Randomly sample n points from the available (non-masked) space
  sample.idx <- sample(1:nrow(gdf), n)
  samples <- gdf[sample.idx,] %>%
    mutate(common.name = species, 
           state = state, 
           lon = NA, 
           lat = NA,
           observations=0)
  
  # Populate lon and lat values:
  coords <- st_coordinates(samples)
  samples$lon <- coords[, "X"]
  samples$lat <- coords[, "Y"]
  
  return(samples)
}

# Get totals by species and state
totals <- obs.df %>%
  as_tibble() %>%
  select(state, common.name) %>%
  group_by(state, common.name) %>%
  summarize(N=n(), .groups="keep")


if (!dir.exists(file.path("data", "final_pseudo_absence"))) {
  dir.create(file.path("data", "final_pseudo_absence"))
}

if (!all(
  file.exists(
    paste0(file.path("data", "final_pseudo_absence", 
                     paste0("pa_", states, ".rds")))
  )
)) {
  # Create a list of pseudo absence points, by species and state,
  # where the sample number `n` >= 500 | `n` == the total observed
  # for each respective state and species
  pseudo.absence.pts <- list()
  for (st in states) {
    r.original <- r.list[[st]]
    r.masks <- masks[[st]]
    pseudo.absence.pts[[st]] <- list()
    for (spec in names(r.masks)) {
      r.mask <- r.masks[[spec]]
      n <- totals %>% filter(state == st & common.name == spec) %>% pull(N)
      cat("Generating", n, "pseudo-absence points for the", spec, "in", st, "\n")
      pseudo.absence.pts[[st]][[spec]] <- sample.inverse.mask(
        r.original, r.mask, spec, st, n=n, sample.crs=4326)
      cat("\tGenerated", nrow(pseudo.absence.pts[[st]][[spec]]), "/", n, "points.\n")
    }
  }
  
  # Extract raster values for each point
  for (state in states) {
    out.file.all <- file.path("data", "final_pseudo_absence", paste0("pa_", state, ".rds"))
    if (!file.exists(out.file.all)) {
      r <- r.list[[state]]
      
      cat(sprintf("Extracting points to values for %s...\n", state))
      # Load observations shapefile
      geo.df <- pseudo.absence.pts[[state]] %>% do.call("rbind", .)
      rownames(geo.df) <- NULL
      
      geo.df.crs <- st_crs(geo.df)
      
      # Define target CRS and update
      target.crs <- st_crs(r)
      cat(sprintf("Updating CRS for %s...\n", state))
      geo.df <- st_transform(geo.df, target.crs)
      
      # Extract raster values
      for (r.name in names(r)) {
        cat("\tExtracting", r.name, "values for", state, "\n")
        x <- terra::extract(r[[r.name]], geo.df)[[r.name]]
        geo.df[[gsub(paste0("_", state), "", r.name)]] <- x
      }
      
      # Update crs back
      geo.df <- st_transform(geo.df, geo.df.crs)
      
      # Fix names; Filter NA values
      geo.df <- geo.df %>%
        filter(dplyr::if_all(names(.), ~!is.na(.))) %>%
        suppressWarnings() 
      
      saveRDS(geo.df, out.file.all)
      cat("--------------\n")
    }
  }
}
# Get all pseudo-absence data
abs.df <- list.files(file.path("data", "final_pseudo_absence"), full.names = T) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observation.point=geometry)

# There might be some slight differences since there are occasionally NA values
abs.df %>%
  as_tibble() %>%
  select(state, common.name) %>%
  group_by(state, common.name) %>%
  summarize(psuedo.absence.N=n(), .groups="keep") %>%
  left_join(totals, by=c("state", "common.name")) %>%
  rename(observed.N = N) %>%
  knitr::kable()
state common.name psuedo.absence.N observed.N
CO Belted Kingfisher 4523 4551
CO Cedar Waxwing 3431 3446
CO Downy Woodpecker 7440 7511
CO Ruddy Duck 1715 1726
CO Sanderling 497 131
CO Sandhill Crane 1524 1532
CO Sharp-shinned Hawk 2241 2257
CO Wild Turkey 2597 2611
NC Belted Kingfisher 4042 4183
NC Cedar Waxwing 4059 4195
NC Downy Woodpecker 10415 10914
NC Ruddy Duck 1076 1106
NC Sanderling 488 311
NC Sandhill Crane 489 118
NC Sharp-shinned Hawk 1211 1254
NC Wild Turkey 2261 2372
OR Belted Kingfisher 5803 5837
OR Cedar Waxwing 8405 8446
OR Downy Woodpecker 8529 8576
OR Ruddy Duck 1996 2010
OR Sanderling 496 258
OR Sandhill Crane 2443 2458
OR Sharp-shinned Hawk 2696 2714
OR Wild Turkey 2429 2440
VT Belted Kingfisher 1956 2033
VT Cedar Waxwing 1098 3938
VT Downy Woodpecker 1598 4635
VT Ruddy Duck 490 51
VT Sanderling 493 39
VT Sandhill Crane 492 76
VT Sharp-shinned Hawk 730 748
VT Wild Turkey 2090 2181

Spatial Autocorrelation

Moran’s I

This is a common measure of global spatial autocorrelation. A positive Moran’s I suggests clustering, a negative value suggests dispersion, and a value near zero suggests randomness. In other words, this is randomization approach to test for spatial autocorrelation. It is essentially checking if the data has a spatial pattern that is significantly different from what would be expected if the values were randomly distributed in space.

Understanding the Results

Moran I statistic: This is the calculated Moran’s I value for the data, which quantifies the degree of spatial autocorrelation. A positive value indicates positive autocorrelation (similar values are closer together), while a negative value indicates negative autocorrelation (dissimilar values are closer together). A value close to zero indicates a random spatial pattern.

Moran I statistic standard deviate: This value represents the standardized Moran’s I value. The larger the absolute value of this statistic, the stronger the evidence against the null hypothesis (of no spatial autocorrelation). A positive value indicates positive spatial autocorrelation, suggesting clustering of similar values.

P-value: This is the probability of observing a Moran’s I value as extreme as, or more extreme than, the one computed from the data, assuming the null hypothesis of no spatial autocorrelation is true. A very small p-value provides strong evidence to reject the null hypothesis, indicating significant spatial autocorrelation in your data.

Expectation: This is the expected Moran’s I value under the null hypothesis of no spatial autocorrelation. For a large dataset, it’s typically close to zero.

Variance: This is the variance of the Moran’s I statistic under the null hypothesis.

Spatial Autocorrelation of Observations

if (!dir.exists("artifacts/obs_autocor_morans")) {
  dir.create("artifacts/obs_autocor_morans")
}

# Get observation data
df <- list.files(file.path("data", "final"), full.names = T) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observations, geometry) # We only interested in these data here

states <- sort(unique(df$state))
species <- sort(unique(df$common.name))

# Function to extract the desired results from output
extract.data <- function(st, spec, results) {
  data_frame(
    state = st,
    species = spec,
    statistic = results$statistic,
    p.value = results$p.value,
    moran.i.statistic = results$estimate['Moran I statistic'],
    expectation = results$estimate['Expectation'],
    variance = results$estimate['Variance']
  )
}

# Define the k for knn computation
k <- 5

if (!file.exists("artifacts/obs_autocor_morans/mi.results.rds")) {
  # Perform test by state, by species
  mi.results <- list()
  
  for (st in states) {
    mi.results[[st]] <- list()
    for (spec in species) {
      cat("Doing Moran's test for", spec, "in", st, "\n")
      # Filter data
      d <- df %>% filter(common.name == spec & state == st)
      mi.results[[st]][[spec]]$data <- d
      # Fit knn model
      knn <- d %>%
        knearneigh(k = k)
      mi.results[[st]][[spec]]$knn <- knn
      # Create a neighbor's list
      nb <- knn %>%
        knn2nb()
      mi.results[[st]][[spec]]$nb <- nb
      # Create a spatial weights matrix
      listw <- nb2listw(nb)
      mi.results[[st]][[spec]]$listw <- listw
      # Compute Moran's I
      results <- moran.test(d$observations, listw)
      mi.results[[st]][[spec]]$moran.test.results <- extract.data(st, spec, results)
    }
  }
  saveRDS(mi.results, "artifacts/obs_autocor_morans/mi.results.rds")
} else {
  mi.results <- readRDS("artifacts/obs_autocor_morans/mi.results.rds")
}

spec.stat <- expand.grid(species=species, state=states, stringsAsFactors=F)

mi.results.df <- purrr::map(1:nrow(spec.stat), function(i) {
    spec <- spec.stat[i, ]$species
    st <- spec.stat[i, ]$state
    mi.results[[st]][[spec]]$moran.test.results
  }) %>% 
  do.call("rbind", .) %>%
  as_tibble() %>%
  # Compute adjusted p-value, accounting for multiple comparisons
  mutate(adj.p.value = p.adjust(p.value, method="holm"))
# Filter to where the adjusted p-value <= 0.05; sort by moran i stat
mi.results.df %>% 
  filter(adj.p.value <= 0.05) %>%
  arrange(-moran.i.statistic)

The Moran’s I statistic for the Belted Kingfisher and Cedar Waxwing are positive in Oregon, as well as the Sharp-shinned Hawk in both Vermont and Oregon. This suggests that locations with high observations of these species are close to other locations with high observations, and the same for low observations. In simple terms, the observation patterns of these species are clustered.

Below is an example of the actual observations of the Sharp-shinned Hawk in Vermont against the spatial lag (i.e., the weighted sum of the neighboring values at each point). Consider the following when interpreting the plot:

  • First Quadrant (top-right): High values surrounded by high values (indicating clustering of high values).
  • Second Quadrant (top-left): Low values surrounded by high values.
  • Third Quadrant (bottom-left): Low values surrounded by low values (indicating clustering of low values).
  • Fourth Quadrant (bottom-right): High values surrounded by low values.
# Get spatial weights list
vt.ssh <- mi.results[["VT"]][["Sharp-shinned Hawk"]]$data

# Calculate the spatial lag of the observations
vt.ssh$spatial.lag <- lag.listw(
  mi.results[["VT"]][["Sharp-shinned Hawk"]]$listw,
  vt.ssh$observations)

# Scatter plot for the Sharp-Shinned Hawk in Vermont
ggplot(vt.ssh, aes(x=observations, y=spatial.lag)) +
  geom_point() +
  geom_smooth(method="lm", col="red") + # Adds a linear regression line
  ggtitle("Moran Scatter Plot for Sharp-shinned Hawk in VT") +
  xlab("Observations (log10 Scale)") +
  ylab("Spatial Lag of Observations (log10 Scale)") +
  scale_y_log10() + scale_x_log10()

Spatial Autocorrelation of Environmental Factors

env.df.list <- list()
for (state in states) {
  r <- r.list[[state]]
  
  gdf <- terra::as.points(r) %>% 
    st_as_sf() %>%
    st_transform(crs=4326)
  # Fix names
  names(gdf) <- gsub(paste0("_", state, "$"), "", names(gdf))
  
  # Convert land cover to binary variables
  binary.lc <- caret::dummyVars(~NLCD_Land, data=gdf, sep=".") %>% 
    predict(., gdf) %>%
    as.data.frame()
  names(binary.lc) <- gsub("NLCD_Land", "", names(binary.lc)) %>% 
    gsub("\\/| |,", "_", .) %>% 
    gsub("__", "_", .) %>% 
    tolower()
  
  gdf <- gdf %>% select(-NLCD_Land) %>% cbind(binary.lc)
  
  env.df.list[[state]] <- gdf
}

# Perform test by state, by environmental variable
env.mi.results <- list()

output.dir <- file.path("artifacts", "env_autocor_morans")
if (!dir.exists(output.dir)) dir.create(output.dir)

for (st in states) {
  env.mi.results[[st]] <- list()
  gdf <- env.df.list[[st]]
  env.vars <- names(gdf)[names(gdf) != "geometry"]
  for (e in env.vars) {
    output.path <- file.path(output.dir, paste0(st, "_", e, ".rds"))
    if (!file.exists(output.path)) {
      cat("Doing Moran's test for", e, "in", st, "\n")
      # Filter data
      d <- gdf %>% select(!!sym(e))
      # env.mi.results[[st]][[e]]$data <- d
      n.distinct <- d %>% pull(!!sym(e)) %>% unique() %>% length()
      if (n.distinct > 1) {
        # Fit knn model
        knn <- d %>%
          knearneigh(k = k)
        # env.mi.results[[st]][[e]]$knn <- knn
        # Create a neighbor's list
        nb <- knn %>%
          knn2nb()
        # env.mi.results[[st]][[e]]$nb <- nb
        # Create a spatial weights matrix
        listw <- nb2listw(nb)
        # env.mi.results[[st]][[e]]$listw <- listw
        # Compute Moran's I
        results <- moran.test(d[[e]], listw)
        env.mi.results[[st]][[e]]$moran.test.results <- extract.data(st, e, results)
      }
      cat("\tSaving results...\n")
      saveRDS(env.mi.results[[st]][[e]]$moran.test.results, output.path)
    } else {
      cat("Getting Moran's test results for", e, "in", st, "\n")
      env.mi.results[[st]][[e]]$moran.test.results <- readRDS(output.path)
    }
  }
}


env.stat <- expand.grid(env=names(env.df.list$CO )[names(env.df.list$CO) != "geometry"], 
                        state=states, stringsAsFactors=F)

env.mi.results.df <- purrr::map(1:nrow(env.stat), function(i) {
    e <- env.stat[i, ]$env
    st <- env.stat[i, ]$state
    env.mi.results[[st]][[e]]$moran.test.results
  }) %>% 
  do.call("rbind", .) %>%
  as_tibble() %>%
  # Compute adjusted p-value, accounting for multiple comparisons
  mutate(adj.p.value = p.adjust(p.value, method="holm"))
# Filter to where the adjusted p-value <= 0.05; sort by moran i stat
env.mi.results.df %>% 
  filter(adj.p.value <= 0.05) %>%
  arrange(-moran.i.statistic)

The weather layers in particular have Moran’s I values close to 1, which is the maximum possible value. This indicates a very strong spatial clustering of these variables.

Mitigating Potential Problems Due to Spatial Autocorrelation

  1. Collinearity
  2. Spatial Autocorrelation in Residuals
  3. Overfitting
  4. Non-Stationarity
Checking for Collinearity in Environmental Factors
all.columns <- unique(unlist(lapply(env.df.list, names)))
env.df <- lapply(env.df.list, function(df) {
  missing.cols <- setdiff(all.columns, names(df))
  for (col in missing.cols) {
    df[[col]] <- NA
  }
  return(df)
}) %>%
  do.call("rbind", .) %>%
  as.data.frame() %>%
  select(-geometry, -unclassified)

corr.matrix <- cor(env.df, use="na.or.complete")
eigenvalues <- eigen(corr.matrix)$values
ci.df <- tibble(
  variable=names(env.df),
  condition.index=sqrt(max(eigenvalues) / eigenvalues)
)

ci.df
# Extract the eigenvectors
eigenvectors <- eigen(corr.matrix)$vectors

threshold <- 30
large.ci.indices <- which(sqrt(max(eigenvalues) / eigenvalues) > threshold)

# Examine the eigenvectors
for (index in large.ci.indices) {
  cat(paste("Eigenvalue:", eigenvalues[index]), "\n")
  cat(paste("Condition Index:", 
                  sqrt(max(eigenvalues) / eigenvalues[index])), "\n")
  
  # Sorting eigenvector components by absolute magnitude for clarity
  ev <- eigenvectors[, index]
  sorted.ev <- sort(abs(ev), decreasing = T)
  ev.top <- sorted.ev[1:5] %>%
    as_tibble() %>%
    mutate(variable=rownames(corr.matrix)[order(-abs(ev))][1:5]) %>%
    select(variable, value)
  
  cat("\nTop 5 contributors to multicollinearity at the condition idx:\n")
  
  for (row in 1:5) {
    cat("\t", ev.top[row,]$variable, ":", signif(ev.top[row,]$value, 3), "\n")
  }
  
  cat("\n------------------------\n")
}
## Eigenvalue: 0.00135666182336175 
## Condition Index: 65.3042479620253 
## 
## Top 5 contributors to multicollinearity at the condition idx:
##   avg_prcp : 0.814 
##   tmax : 0.459 
##   tmin : 0.356 
##   dem : 0.0108 
##   Winter_NDVI : 0.00657 
## 
## ------------------------
## Eigenvalue: 1.97064586234132e-15 
## Condition Index: 54184234.4382929 
## 
## Top 5 contributors to multicollinearity at the condition idx:
##   shrub_scrub : 0.508 
##   evergreen_forest : 0.5 
##   herbaceous : 0.473 
##   cultivated_crops : 0.318 
##   deciduous_forest : 0.207 
## 
## ------------------------
corrplot::corrplot(corr.matrix)

Feature Engineering

Setup

First, create a space to output any “engineered” rasters:

output.dir <- "artifacts/feature_engineering"
if (!dir.exists(output.dir)) dir.create(output.dir)

Use Hierarchical Categories for Land Cover

Recall the hierarchical categories for the land cover data:

  • Water:
    • 11: Open Water
    • 12: Perennial Ice/Snow
  • Developed
    • 21: Developed, Open Space
    • 22: Developed, Low Intensity
    • 23: Developed, Medium Intensity
    • 24: Developed, High Intensity
  • Barren
    • 31: Barren Land (Rock/Sand/Clay)
  • Forest
    • 41: Deciduous Forest
    • 42: Evergreen Forest
    • 43: Mixed Forest
  • Shrubland
    • 51: Dwarf Shrub
    • 52: Shurb/Scrub
  • Herbaceous
    • 71: Grassland/Herbaceous
    • 72: Sedge/Herbaceous
    • 73: Lichens
    • 74: Moss
  • Planted/Cultivated
    • 81: Pasture/Hay
    • 82: Cultivated Crops
  • Wetlands
    • 90: Woody Wetlands
    • 95: Emergent Herbaceous Wetlands

Using these categories, feature reduction can be accomplished using a heuristic methodology. Other redundant rasters, such as the Waterbody and Urban Imperviousness rasters, can also be combined with the respective related land cover rasters to further reduce multicollinearity between rasters.

create.parent.category.rasters <- function(input.raster, 
                                           state, 
                                           output.dir,
                                           layer.name="NLCD_Land") {
  # Define the category mappings
  categories <- list(
    water = c("Open Water"),
    ice.snow = c("Perennial Ice/Snow"),
    developed = c("Developed, Open Space",
                  "Developed, Low Intensity",
                  "Developed, Medium Intensity",  
                  "Developed, High Intensity"),
    barren = c("Barren Land"),
    forest = c("Mixed Forest", 
               "Deciduous Forest", 
               "Evergreen Forest"),
    shrubland = c("Shrub/Scrub", "Dwarf Shrub"),
    herbaceous = c("Herbaceous", "Grassland/Herbaceous",
                   "Sedge/Herbaceous", "Lichens", "Moss"),
    planted.cultivated = c("Cultivated Crops", "Hay/Pasture"),
    wetlands = c("Woody Wetlands", "Emergent Herbaceous Wetlands")
  )
  
  # Iterate through each category to create and save the binary raster
  for (cat.name in names(categories)) {
    # Define the output file path
    output.file <- file.path(output.dir, paste0(state, "_", cat.name, ".tif"))
    
    if (!file.exists(output.file)) {
      cat("Generating raster for", cat.name, "in", state, "\n")
      if (cat.name != "developed") {
        # Create a binary raster for the current category
        out.raster <- terra::lapp(input.raster[[layer.name]], 
                                  fun = function(x) {
                                    case_when(
                                      is.na(x) ~ NA,
                                      x %in% categories[[cat.name]] ~ 1.,
                                      T ~ 0)
                                  })
        names(out.raster) <- cat.name
        if (cat.name == "water") {
          # Combine with waterbody raster layer
          wb.raster <- input.raster[[paste0("waterbody_", state)]]
          out.raster <- (out.raster + wb.raster) / 2
          names(out.raster) <- "water"
        }
      } else {
        # Set developed raster to be a value, scale ranging from 0.25-1 by 0.25
        out.raster <- terra::lapp(input.raster[[layer.name]], 
                                  fun = function(x) {
                                    case_when(
                                      is.na(x) ~ NA,
                                      x == "Developed, Open Space" ~ 0.25,
                                      x == "Developed, Low Intensity" ~ 0.5,
                                      x == "Developed, Medium Intensity" ~ 0.75,
                                      x == "Developed, High Intensity" ~ 1.,
                                      T ~ 0)
                                  })
        # Combine with urban imperviousness
        ui.raster <- input.raster[[paste0("urban_imperviousness_", state)]]
        ui.min <- terra::minmax(ui.raster)[[1]]
        ui.max <- terra::minmax(ui.raster)[[2]]
        ui.raster.scaled <- (ui.raster - ui.min) / (ui.max - ui.min)
        out.raster <- (out.raster + ui.raster.scaled) / 2
        names(out.raster) <- "developed"
      }
      
      # Save the output raster to the specified output path
      writeRaster(out.raster, output.file, overwrite=T)
    }
  }
}

for (state in states) {
  create.parent.category.rasters(r.list[[state]], state, output.dir)
}

Aggregate Seasonal NDVI Rasters (Min/Max)

# Convert 4 separate NDVI rasters to a single raster
for (state in states) {
  output.file.min <- file.path(output.dir, paste0(state, "_min_NDVI.tif"))
  output.file.max <- file.path(output.dir, paste0(state, "_max_NDVI.tif"))
  if (!file.exists(output.file.min) | !file.exists(output.file.max)) {
    r.ndvi <- r.list[[state]][[names(r.list[[state]]) %like% "NDVI"]]
    # Parse seasons
    for (s in names(r.ndvi)) {
      r <- r.ndvi[[s]]
      .min <- terra::minmax(r)[[1]]
      .max <- terra::minmax(r)[[2]]
      # Scale to be from 0 to 1
      r.ndvi[[s]] <- (r - .min) / (.max - .min)
    }
    # Get min/max scaled NDVI values
    r.max <- max(r.ndvi)
    names(r.max) <- "max.ndvi"
    r.min <- min(r.ndvi)
    names(r.min) <- "min.ndvi"
    writeRaster(r.max, output.file.max, overwrite=T)
    writeRaster(r.min, output.file.min, overwrite=T)
  }
}

Combine Temperature Rasters (Difference)

# Get the difference between the min and max temperatures as a raster
for (state in states) {
  output.file <- file.path(output.dir, paste0(state, "_tdiff.tif"))
  if (!file.exists(output.file)) {
    # Get difference
    r.tdiff <- r.list[[state]][[paste0("tmax_", state)]] - 
      r.list[[state]][[paste0("tmin_", state)]]
    # Get min/max differences
    .min <- terra::minmax(r.tdiff)[[1]]
    .max <- terra::minmax(r.tdiff)[[2]]
    # Scale to be from 0 to 1
    r.tdiff <- (r.tdiff - .min) / (.max - .min)
    names(r.tdiff) <- "temp.diff"
    # Write raster
    writeRaster(r.tdiff, output.file, overwrite=T)
  }
}

Spatial Filters

These are components derived from the spatial coordinates, which can capture and account for spatial structures in the data. E.g., a polynomial term based on latitude and longitude could account for some of the spatial trend.

for (state in states) {
  output.file.lon <- file.path(output.dir, paste0(state, "_lon.tif"))
  output.file.lat <- file.path(output.dir, paste0(state, "_lat.tif"))
  output.file.lon2 <- file.path(output.dir, paste0(state, "_lon_poly.tif"))
  output.file.lat2 <- file.path(output.dir, paste0(state, "_lat_poly.tif"))
  output.file.lli <- file.path(output.dir, paste0(state, "_lon_lat_interaction.tif"))
  if (!all(file.exists(c(output.file.lon, output.file.lat, 
                         output.file.lon2, output.file.lat2,
                         output.file.lli)))) {
    # Get raster as df
    r.df <- terra::as.data.frame(r.list[[state]], xy=T, cells=T) %>% 
      rename(lon=x, lat=y) %>%
      select(lon, lat, cell) %>%
      st_as_sf(coords = c("lon", "lat"), crs = st_crs(r.list[[state]])) %>%
      st_transform(crs=4326) %>%
      cbind(st_coordinates(.)) %>%
      rename(lon="X", lat="Y")
    
    # Initialize empty raster templates
    r.lat <- ext(r.list[[state]]) %>% 
            rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
    r.lon <- ext(r.list[[state]]) %>% 
            rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
    # Fill with lat/lon values
    r.lat[r.df$cell] <- r.df$lat
    names(r.lat) <- "lat"
    r.lon[r.df$cell] <- r.df$lon
    names(r.lon) <- "lon"
    # Get polynomial and interaction terms
    r.lat2 <- r.lat^2 
    names(r.lat2) <- "lat.sqrd"
    r.lon2 <- r.lon^2 
    names(r.lon2) <- "lon.sqrd"
    r.lon.lat <- r.lon * r.lat
    names(r.lon.lat) <- "lon.lat.interaction"
    
    # Write outputs
    writeRaster(r.lat, output.file.lat, overwrite=T)
    writeRaster(r.lon, output.file.lon, overwrite=T)
    writeRaster(r.lat2, output.file.lat2, overwrite=T)
    writeRaster(r.lon2, output.file.lon2, overwrite=T)
    writeRaster(r.lon.lat, output.file.lli, overwrite=T)
  }
}

Detrend DEM

for (state in states) {
  output.file <- file.path(output.dir, paste0(state, "_detrend_dem.tif"))
  if (!file.exists(output.file)) {
    # Get raster as df
    r.df <- terra::as.data.frame(r.list[[state]], xy=T, cells=T) %>% 
      rename(lon=x, lat=y) %>%
      select(lon, lat, cell, !!sym(paste0("dem_", state))) %>%
      st_as_sf(coords = c("lon", "lat"), crs = st_crs(r.list[[state]])) %>%
      st_transform(crs=4326) %>%
      cbind(st_coordinates(.)) %>%
      rename(lon="X", lat="Y", dem=paste0("dem_", state)) %>%
      # Convert to data.table
      setDT()
    
    # Fit model based on location, with dem as response
    fit <- lm(dem ~ lat * lon + I(lat^2) + I(lon^2), 
              data=r.df[!is.na(dem)])
    # Get residuals from the model as "de-trended" dem values
    r.df[!is.na(dem), dem.detrended := residuals(fit)]
    # Scale de-trended values
    r.df[, dem.detrended := (dem.detrended - min(dem.detrended, na.rm=T)) / 
           (max(dem.detrended, na.rm=T) - min(dem.detrended, na.rm=T))]
    # Initialize empty raster template
    r.dem <- ext(r.list[[state]]) %>% 
            rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
    # Fill with de-trended dem values
    r.dem[r.df$cell] <- r.df$dem.detrended
    names(r.dem) <- "dem.detrended"
    
    # Write outputs
    writeRaster(r.dem, output.file, overwrite=T)
  }
}

Feature Selection

Load all Original Features and Feature Engineering Outputs

final.output.dir <- "artifacts/final_data"
if (!dir.exists(final.output.dir)) dir.create(final.output.dir)
final.fpath <- file.path(final.output.dir, "final_data.rds")

if (!file.exists(final.fpath)) {
  # Combine observation/pseudo-absence data
  df <- c(list.files(file.path("data", "final_pseudo_absence"), full.names = T),
          list.files(file.path("data", "final"), full.names = T)) %>%
    purrr::map_df(readRDS)
  
  # Get feature engineered raster data
  output.dir <- "artifacts/feature_engineered_final"
  if (!dir.exists(output.dir)) dir.create(output.dir)
  get.fe <- all(case_when(is_empty(list.files(output.dir)) ~ T,
                          list.files(output.dir) < length(states) ~ T,
                          T ~ F))
  fe.r.list <- list()
  if (get.fe) {
    for (state in states) {
      fe.r.list[[state]] <- list.files(
        "artifacts/feature_engineering", full.names = T)[
          grepl(
            paste0("^", state, "_"), 
            list.files("artifacts/feature_engineering", full.names = F)
          )
        ] %>% rast()
      # Save as multi-layer rasters by state
      writeRaster(fe.r.list[[state]], 
                  file.path(output.dir, paste0(state, ".tif")),
                  overwrite=T)
    }
  } else {
    for (state in states) {
      fe.r.list[[state]] <- rast(file.path(output.dir, paste0(state, ".tif")))
    }
  }
  
  # Add empty fields in dataframe for each new raster 
  purrr::map(fe.r.list, names) %>% 
    reduce(c) %>% 
    unique() %>% 
    sort() %>%
    purrr::walk(function(n) {
      if (!(n %in% names(df))) df[[n]] <<- 0
    })
  
  # Update point crs to match fe rasters
  df <- st_transform(df, st_crs(fe.r.list[[1]])) 
  df.list <- list()
  # From state multi-layer rasters, extract values from each point in df
  for (st in states) {
    cat(sprintf("Extracting points to values for %s...\n", st))
    r <- fe.r.list[[st]]
    df.list[[st]] <- df %>% filter(state == st)
    # Extract raster values
    for (r.name in names(r)) {
      cat("\tExtracting", r.name, "values for", st, "\n")
      x <- terra::extract(r[[r.name]], df.list[[st]])[[r.name]]
      df.list[[st]] <- mutate(df.list[[st]], !!r.name := x)
    }
  }
  # Combine list
  df <- do.call("rbind", df.list)
  rm(df.list)
  gc()
  # Update crs back
  df <- st_transform(df, 4326)
  # Remove rownames
  rownames(df) <- NULL
  
  # Convert land cover to binary variables
  binary.lc <- caret::dummyVars(~NLCD_Land, data=df, sep="") %>% 
    predict(., df) %>%
    as.data.frame()
  
  names(binary.lc) <- gsub("NLCD_Land", "", names(binary.lc)) %>% 
    gsub("\\/| |,", "_", .) %>% 
    gsub("__", "_", .) %>% 
    tolower()
  
  # Remove duplicates from feature engineering
  binary.lc <- binary.lc %>%
    select(-c("unclassified", "perennial_snow_ice", "barren_land",
              "shrub_scrub", "herbaceous"))
  # Combine with dataframe, remove land cover categorical var
  df <- df %>% select(-NLCD_Land) %>% cbind(binary.lc)
  saveRDS(df, final.fpath)
} else {
  df <- readRDS(final.fpath)
}

# Convert to data.table
df %>% setDT()

# View output
df %>% as_tibble()

Correlation

obs.cor <- purrr::map(sort(unique(df$common.name)), function(spec) {
  corr.matrix <- cor(select(df[common.name == spec], 
                            -c("state", "common.name", "geometry")))
  obs.cor <- corr.matrix[which(rownames(corr.matrix) == "observations"),] %>%
    as.data.frame()
  obs.cor$variable <- rownames(obs.cor)
  obs.cor %>%
    filter(variable != "observations") %>%
    rename(!!spec := `.`) %>%
    arrange(-abs(!!sym(spec))) %>%
    mutate(!!spec := paste0(variable, ": ", signif(!!sym(spec), 2))) %>%
    select(!!sym(spec)) 
}) %>% 
  do.call("cbind", .)

obs.cor %>% as_tibble()

Train/Test Split

# Set seed for splitting and modeling
set.seed(19)

stratified.split.idx <- function(df, p=0.7, lat.lon.bins=25) {
  # Cut along lat/lon values to create grids (lat.bin & lon.bin)
  # lat.lon.bins is the number of divisions you want
  df$lat.bin <- cut(df$lat, breaks=lat.lon.bins, labels = F)
  df$lon.bin <- cut(df$lon, breaks=lat.lon.bins, labels = F)
  df$absence <- df$observations == 0
  
  # Create a new variable combining the stratification variables
  df %>%
    # stratify on lat/lon bins, species, state, and absence
    mutate(strata = paste(lat.bin, lon.bin, common.name, state, absence)) %>%
    pull(strata) %>%
    # Create the data partitions
    createDataPartition(., p = p, list = F) %>%
    suppressWarnings()
}

prepare.data <- function(df, p=.7, lat.lon.bins=25) {
  train.index <- stratified.split.idx(df, p=p, lat.lon.bins = lat.lon.bins)
  df.train <- df[train.index, ]
  df.test <- df[-train.index, ]
  
  list(train = df.train, 
       test = df.test,
       index = train.index)
}

# Get train/test indices
train.test <- prepare.data(df, .7)

# Split; Remove non-predictive variables
df.train <- df[train.test$index,] 
df.train[, `:=` (geometry=NULL)]
df.test <- df[-train.test$index,]
df.test[, `:=` (geometry=NULL)]

train.x <- df.train %>% select(-observations)
train.y <- df.train$observations

test.x <- df.test %>% select(-observations)
test.y <- df.test$observations
# Get model from cache if it has been saved before
get.model <- function(model, file.name, model.path) {
  f.path <- file.path(model.path, file.name)
  if (!dir.exists(model.path)) {
    dir.create(model.path)
  }
  # Model cache check
  if (ifelse(file.exists(f.path), T, F)) {
    model <- readRDS(f.path)
  } else {
    saveRDS(model, f.path)
  }
  model
}
species <- sort(unique(df.train$common.name))
if (!dir.exists("artifacts/models")) dir.create("artifacts/models")

# Define the control for the train function
ctrl <- trainControl(method = "cv", number = 5)

lasso.list <- purrr::map(species, function(spec) {
  spec.state.fit <- purrr::map(states, function(st) {
    cat("Fitting LASSO model for", spec, "in", st, "\n")
    spec.df <- df.train[common.name == spec & state == st][
      , `:=` (state=NULL, common.name = NULL)]
    
    # Remove any columns where all values are the same
    .remove <- names(which(sapply(spec.df, function(x) length(unique(x)) < 1)))
    .remove <- .remove[.remove != "observations"]
    if (!is_empty(.remove)) {
      spec.df <- spec.df[, ...remove]
    }
    
    # Scale data
    # Identify fields to center/scale
    to.scale <- sapply(spec.df, function(x) is.numeric(x) && 
                         length(unique(x)) > 2)
    to.scale$observations <- F
    to.scale <- names(spec.df) %in% names(which(unlist(to.scale)))
    
    # Define the pre-processing steps (use the training data to avoid data leakage)
    # Apply centering and scaling to the non-binary fields and non-target
    preproc <- preProcess(spec.df[, ..to.scale], 
                          method = c("center", "scale"))
    
    # Center/Scale the training data
    df.train.scaled <- bind_cols(spec.df[, !(..to.scale)],
                                 predict(preproc, spec.df[, ..to.scale]))
    
    # LASSO (L1); Elastic Net, where alpha = 1
    fit.l1 <- get.model(
      train(observations ~ (.)^2, 
            data = df.train.scaled, 
            method = "glmnet",
            trControl = ctrl,
            tuneGrid = expand.grid(alpha = 1, 
                                   lambda = seq(0, 1, by = 0.1)),
            metric = "RMSE"),
      file.name=paste0(tolower(gsub(" ", "_", spec)), "_", st, "_regr_l1.rds"),
      model.path="artifacts/models/lasso_fs")
    
    gc()
    fit.l1
  })
  names(spec.state.fit) <- states
  spec.state.fit
})
## Fitting LASSO model for Belted Kingfisher in CO 
## Fitting LASSO model for Belted Kingfisher in NC 
## Fitting LASSO model for Belted Kingfisher in OR 
## Fitting LASSO model for Belted Kingfisher in VT 
## Fitting LASSO model for Cedar Waxwing in CO 
## Fitting LASSO model for Cedar Waxwing in NC 
## Fitting LASSO model for Cedar Waxwing in OR 
## Fitting LASSO model for Cedar Waxwing in VT 
## Fitting LASSO model for Downy Woodpecker in CO 
## Fitting LASSO model for Downy Woodpecker in NC 
## Fitting LASSO model for Downy Woodpecker in OR 
## Fitting LASSO model for Downy Woodpecker in VT 
## Fitting LASSO model for Ruddy Duck in CO 
## Fitting LASSO model for Ruddy Duck in NC 
## Fitting LASSO model for Ruddy Duck in OR 
## Fitting LASSO model for Ruddy Duck in VT 
## Fitting LASSO model for Sanderling in CO 
## Fitting LASSO model for Sanderling in NC 
## Fitting LASSO model for Sanderling in OR 
## Fitting LASSO model for Sanderling in VT 
## Fitting LASSO model for Sandhill Crane in CO 
## Fitting LASSO model for Sandhill Crane in NC 
## Fitting LASSO model for Sandhill Crane in OR 
## Fitting LASSO model for Sandhill Crane in VT 
## Fitting LASSO model for Sharp-shinned Hawk in CO 
## Fitting LASSO model for Sharp-shinned Hawk in NC 
## Fitting LASSO model for Sharp-shinned Hawk in OR 
## Fitting LASSO model for Sharp-shinned Hawk in VT 
## Fitting LASSO model for Wild Turkey in CO 
## Fitting LASSO model for Wild Turkey in NC 
## Fitting LASSO model for Wild Turkey in OR 
## Fitting LASSO model for Wild Turkey in VT
names(lasso.list) <- species

spec.state <- expand.grid(common.name=species, state=states, stringsAsFactors=F)

lasso.imp <- purrr::map_df(1:nrow(spec.state), function(i) {
  spec <- spec.state[i,]$common.name
  st <- spec.state[i,]$state
  fit <- lasso.list[[spec]][[st]]
  coef.df <- coef(fit$finalModel, s = fit$bestTune$lambda) %>%
    as.matrix() %>%
    as.data.frame()
  # Remove the intercept
  coef.df <- coef.df[-1, , drop = F]
  
  # Create a data frame of variable importance
  var.importance <- tibble(
    common.name = spec,
    state = st,
    variable = rownames(coef.df),
    importance = abs(coef.df[,1])
  ) %>%
    # Rank variables by importance
    arrange(state, common.name, -importance, variable) %>%
    # Only look at variables where imp. > 0
    filter(importance > 0)
})

lasso.imp
# Decision Tree

Boruta

Random Permutations

Recursive Feature Elimination